!mkdir -p data
!wget https://www.dropbox.com/s/cwlaaxl70t27tb2/data_tutorial_buenrostro.tar.gz?dl=0 -O data/data_tutorial_buenrostro.tar.gz
!cd data; tar -xzf data_tutorial_buenrostro.tar.gz
!mkdir -p write
--2021-08-20 10:06:44-- https://www.dropbox.com/s/cwlaaxl70t27tb2/data_tutorial_buenrostro.tar.gz?dl=0 Resolving www.dropbox.com (www.dropbox.com)... 162.125.65.18, 2620:100:6022:18::a27d:4212 Connecting to www.dropbox.com (www.dropbox.com)|162.125.65.18|:443... connected. HTTP request sent, awaiting response... 301 Moved Permanently Location: /s/raw/cwlaaxl70t27tb2/data_tutorial_buenrostro.tar.gz [following] --2021-08-20 10:06:45-- https://www.dropbox.com/s/raw/cwlaaxl70t27tb2/data_tutorial_buenrostro.tar.gz Reusing existing connection to www.dropbox.com:443. HTTP request sent, awaiting response... 302 Found Location: https://uc459630d8c3600974a97b8bb644.dl.dropboxusercontent.com/cd/0/inline/BUl6i1A04oJeu9ft2kH6mzITaEB0LwBGK3GugtSbo5jCGCTe-iOi1froK0Rm5k4Y_ZGOEZSDOgwAQzNJb7HphaGeYUIXBYmXmmPYPfuhNJOgn4ld9bUhTvy1K1O97DuQ83fHIE69CbtGo9RAcW23t6Ce/file# [following] --2021-08-20 10:06:45-- https://uc459630d8c3600974a97b8bb644.dl.dropboxusercontent.com/cd/0/inline/BUl6i1A04oJeu9ft2kH6mzITaEB0LwBGK3GugtSbo5jCGCTe-iOi1froK0Rm5k4Y_ZGOEZSDOgwAQzNJb7HphaGeYUIXBYmXmmPYPfuhNJOgn4ld9bUhTvy1K1O97DuQ83fHIE69CbtGo9RAcW23t6Ce/file Resolving uc459630d8c3600974a97b8bb644.dl.dropboxusercontent.com (uc459630d8c3600974a97b8bb644.dl.dropboxusercontent.com)... 162.125.65.15, 2620:100:6022:15::a27d:420f Connecting to uc459630d8c3600974a97b8bb644.dl.dropboxusercontent.com (uc459630d8c3600974a97b8bb644.dl.dropboxusercontent.com)|162.125.65.15|:443... connected. HTTP request sent, awaiting response... 302 Found Location: /cd/0/inline2/BUl5afTGV8Hs3DgAXj3aYcN3bHhYbXbQPbQ_lfp1kDvKUdzFq4Eo_QFlWD5hXAy-ayu3jzD4O6SYYqkMJ7MRjABYdHwdQHunrwWetGfzKluV82YRFu4JEUAk3bLWUZzTvVyRUP1h8DRXt46ow6Fa6BWrQdvCVB5fZQE30yisOPOa494-21XpWMdaE6akZT1g45R6TJvXUd09TsfNyAsfixHKvw_nq3XormDVhd5XG1oI-f2SQAAEVfFDohokeG72qNAcsu-IXsrBFwznQ5FvNxjiq5baZPFFOWGurrE7lbjD27NjDokFh1G91rlIgvH8kaQH2QLrMwqG2Xug7nbUfy7jYexSB8hkVL_iWS9FCdijAlR8ChxDbPz_ZFzEDx8vPMs/file [following] --2021-08-20 10:06:46-- https://uc459630d8c3600974a97b8bb644.dl.dropboxusercontent.com/cd/0/inline2/BUl5afTGV8Hs3DgAXj3aYcN3bHhYbXbQPbQ_lfp1kDvKUdzFq4Eo_QFlWD5hXAy-ayu3jzD4O6SYYqkMJ7MRjABYdHwdQHunrwWetGfzKluV82YRFu4JEUAk3bLWUZzTvVyRUP1h8DRXt46ow6Fa6BWrQdvCVB5fZQE30yisOPOa494-21XpWMdaE6akZT1g45R6TJvXUd09TsfNyAsfixHKvw_nq3XormDVhd5XG1oI-f2SQAAEVfFDohokeG72qNAcsu-IXsrBFwznQ5FvNxjiq5baZPFFOWGurrE7lbjD27NjDokFh1G91rlIgvH8kaQH2QLrMwqG2Xug7nbUfy7jYexSB8hkVL_iWS9FCdijAlR8ChxDbPz_ZFzEDx8vPMs/file Reusing existing connection to uc459630d8c3600974a97b8bb644.dl.dropboxusercontent.com:443. HTTP request sent, awaiting response... 200 OK Length: 30829526 (29M) [application/octet-stream] Saving to: ‘data/data_tutorial_buenrostro.tar.gz’ data/data_tutorial_ 100%[===================>] 29.40M 16.7MB/s in 1.8s 2021-08-20 10:06:48 (16.7 MB/s) - ‘data/data_tutorial_buenrostro.tar.gz’ saved [30829526/30829526]
## load library
import scanpy as sc
import anndata as ad
import numpy as np
import pandas as pd
import episcanpy.api as epi
import scopen
sc.settings.set_figure_params(dpi=80, color_map='gist_earth')
results_file = 'write/buenrostro_pbmc.h5ad' # the file that will store the analysis results
adata = ad.read('data/data_tutorial_buenrostro/all_buenrostro_bulk_peaks.h5ad')
adata
AnnData object with n_obs × n_vars = 2034 × 491436
obs: 'batch', 'cell_name'
# display information stored in adata.obs
adata.obs
| batch | cell_name | |
|---|---|---|
| BM1077-MPP-Frozen-160105-1.dedup.st.bam | 0 | BM1077-MPP-Frozen-160105-1 |
| singles-20160726-scATAC-BM1137-GMP3high-HYC-88.dedup.st.bam | 0 | singles-20160726-scATAC-BM1137-GMP3high-HYC-88 |
| singles-160808-scATAC-BM1137-GMP2mid-LS-65.dedup.st.bam | 0 | singles-160808-scATAC-BM1137-GMP2mid-LS-65 |
| singles-BM0828-LMPP-frozen-151105-20.dedup.st.bam | 0 | singles-BM0828-LMPP-frozen-151105-20 |
| singles-160819-BM1137-CMP-LS-95.dedup.st.bam | 0 | singles-160819-BM1137-CMP-LS-95 |
| ... | ... | ... |
| BM1077-LMPP-Frozen-160107-40.dedup.st.bam | 1 | BM1077-LMPP-Frozen-160107-40 |
| BM1077-MPP-Frozen-160105-74.dedup.st.bam | 1 | BM1077-MPP-Frozen-160105-74 |
| singles-BM1214-GMP-160421-9.dedup.st.bam | 1 | singles-BM1214-GMP-160421-9 |
| singles-BM0828-LMPP-frozen-151105-62.dedup.st.bam | 1 | singles-BM0828-LMPP-frozen-151105-62 |
| singles-BM0828-MEP-160420-43.dedup.st.bam | 1 | singles-BM0828-MEP-160420-43 |
2034 rows × 2 columns
# checking the variable names (bulk peaks coordinates)
print(adata.var_names)
Index(['chr1_10279_10779', 'chr1_13252_13752', 'chr1_16019_16519',
'chr1_29026_29526', 'chr1_96364_96864', 'chr1_115440_115940',
'chr1_237535_238035', 'chr1_240811_241311', 'chr1_540469_540969',
'chr1_713909_714409',
...
'chrX_154822578_154823078', 'chrX_154840821_154841321',
'chrX_154841449_154841949', 'chrX_154841956_154842456',
'chrX_154842517_154843017', 'chrX_154862057_154862557',
'chrX_154870909_154871409', 'chrX_154880741_154881241',
'chrX_154891824_154892324', 'chrX_154896342_154896842'],
dtype='object', name='index', length=491436)
adata.obs['facs_label'] = ['MEP' if 'MEP' in line else line.split('.bam')[0].lstrip('singles-').split('BM')[-1].split('-')[1] for line in adata.obs['cell_name'].tolist()]
adata
AnnData object with n_obs × n_vars = 2034 × 491436
obs: 'batch', 'cell_name', 'facs_label'
!head 'data/data_tutorial_buenrostro/metadata.tsv'
label cell_type BM1077-CLP-Frozen-160106-13 CLP BM1077-CLP-Frozen-160106-14 CLP BM1077-CLP-Frozen-160106-2 CLP BM1077-CLP-Frozen-160106-21 CLP BM1077-CLP-Frozen-160106-27 CLP BM1077-CLP-Frozen-160106-3 CLP BM1077-CLP-Frozen-160106-36 CLP BM1077-CLP-Frozen-160106-42 CLP BM1077-CLP-Frozen-160106-44 CLP
# format the cell names to match the metadata file
adata.obs_names = [x.split('/')[-1].split('.dedup.st.bam')[0] for x in adata.obs_names.tolist()]
adata.obs_names
Index(['BM1077-MPP-Frozen-160105-1',
'singles-20160726-scATAC-BM1137-GMP3high-HYC-88',
'singles-160808-scATAC-BM1137-GMP2mid-LS-65',
'singles-BM0828-LMPP-frozen-151105-20',
'singles-160819-BM1137-CMP-LS-95', 'BM1077-MPP-Frozen-160105-36',
'singles-20160726-scATAC-BM1214-CMP-LS-84',
'BM1077-CMP-Frozen-160106-21', 'singles-BM0106-HSC-SIM-160219-36',
'singles-BM1214-GMP-160421-60',
...
'singles-160822-BM1137-CMP-LS-91',
'singles-BM0828-CMP-frozen-151118-69',
'singles-20160617-scATAC-BM1214-CMP-LS-40',
'singles-BM0828-GMP-151027-2',
'singles-20160726-scATAC-BM1214-CMP-LS-19',
'BM1077-LMPP-Frozen-160107-40', 'BM1077-MPP-Frozen-160105-74',
'singles-BM1214-GMP-160421-9', 'singles-BM0828-LMPP-frozen-151105-62',
'singles-BM0828-MEP-160420-43'],
dtype='object', length=2034)
epi.pp.load_metadata(adata,
'data/data_tutorial_buenrostro/metadata.tsv',
separator='\t')
adata
AnnData object with n_obs × n_vars = 2034 × 491436
obs: 'batch', 'cell_name', 'facs_label', 'label', 'cell_type'
# check the 2 different annotation obtained
pd.crosstab(adata.obs['cell_type'], adata.obs['facs_label'])
| facs_label | CLP | CMP | GMP | GMP1low | GMP2mid | GMP3high | HSC | LMPP | MCP | MEP | MPP | UNK | mono | pDC |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| cell_type | ||||||||||||||
| CLP | 78 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| CMP | 0 | 502 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| GMP | 0 | 0 | 216 | 68 | 44 | 74 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| HSC | 0 | 0 | 0 | 0 | 0 | 0 | 347 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| LMPP | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 160 | 0 | 0 | 0 | 0 | 0 | 0 |
| MEP | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 138 | 0 | 0 | 0 | 0 |
| MPP | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 142 | 0 | 0 | 0 |
| UNK | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 60 | 0 | 0 |
| mono | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 64 | 0 |
| pDC | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 73 | 0 | 0 | 0 | 0 | 68 |
Download annotation file (the data are aligned on hg19)
!wget ftp://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_19/gencode.v19.annotation.gtf.gz -O data/data_tutorial_buenrostro/gencode.v19.annotation.gtf.gz
!cd data/data_tutorial_buenrostro; gunzip gencode.v19.annotation.gtf
--2021-08-20 10:06:52-- ftp://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_19/gencode.v19.annotation.gtf.gz
=> ‘data/data_tutorial_buenrostro/gencode.v19.annotation.gtf.gz’
Resolving ftp.ebi.ac.uk (ftp.ebi.ac.uk)... 193.62.193.138
Connecting to ftp.ebi.ac.uk (ftp.ebi.ac.uk)|193.62.193.138|:21... connected.
Logging in as anonymous ... Logged in!
==> SYST ... done. ==> PWD ... done.
==> TYPE I ... done. ==> CWD (1) /pub/databases/gencode/Gencode_human/release_19 ... done.
==> SIZE gencode.v19.annotation.gtf.gz ... 37991892
==> PASV ... done. ==> RETR gencode.v19.annotation.gtf.gz ... done.
Length: 37991892 (36M) (unauthoritative)
gencode.v19.annotat 100%[===================>] 36.23M 17.5MB/s in 2.1s
2021-08-20 10:06:55 (17.5 MB/s) - ‘data/data_tutorial_buenrostro/gencode.v19.annotation.gtf.gz’ saved [37991892]
epi.tl.find_genes(adata,
gtf_file='data/data_tutorial_buenrostro/gencode.v19.annotation.gtf',
key_added='transcript_annotation',
upstream=2000,
feature_type='transcript',
annotation='HAVANA',
raw=False)
adata
AnnData object with n_obs × n_vars = 2034 × 491436
obs: 'batch', 'cell_name', 'facs_label', 'label', 'cell_type'
var: 'transcript_annotation'
Check if the data matrix is binary - if not, binarize the data matrix
print(np.max(adata.X))
1.0
# remove any potential empty features or barcodes
epi.pp.filter_cells(adata, min_features=1)
epi.pp.filter_features(adata, min_cells=1)
# filtered out 24210 peaks
adata
AnnData object with n_obs × n_vars = 2034 × 467226
obs: 'batch', 'cell_name', 'facs_label', 'label', 'cell_type', 'nb_features'
var: 'transcript_annotation', 'n_cells'
adata.obs['log_nb_features'] = [np.log10(x) for x in adata.obs['nb_features']]
adata
AnnData object with n_obs × n_vars = 2034 × 467226
obs: 'batch', 'cell_name', 'facs_label', 'label', 'cell_type', 'nb_features', 'log_nb_features'
var: 'transcript_annotation', 'n_cells'
epi.pl.violin(adata, ['nb_features'])
epi.pl.violin(adata, ['log_nb_features'])
FutureWarning:/home/rs619065/miniconda3/envs/scopen/lib/python3.8/site-packages/anndata/_core/anndata.py:1220: The `inplace` parameter in pandas.Categorical.reorder_categories is deprecated and will be removed in a future version. Removing unused categories will always return a new Categorical object. ... storing 'facs_label' as categorical FutureWarning:/home/rs619065/miniconda3/envs/scopen/lib/python3.8/site-packages/anndata/_core/anndata.py:1220: The `inplace` parameter in pandas.Categorical.reorder_categories is deprecated and will be removed in a future version. Removing unused categories will always return a new Categorical object. ... storing 'cell_type' as categorical FutureWarning:/home/rs619065/miniconda3/envs/scopen/lib/python3.8/site-packages/anndata/_core/anndata.py:1220: The `inplace` parameter in pandas.Categorical.reorder_categories is deprecated and will be removed in a future version. Removing unused categories will always return a new Categorical object. ... storing 'transcript_annotation' as categorical
# set a minimum number of cells to keep
min_features = 1000
epi.pp.coverage_cells(adata, binary=True, log=False, bins=50,
threshold=min_features, save='Buenrostro_bulk_peaks_coverage_cells.png')
epi.pp.coverage_cells(adata, binary=True, log=10, bins=50,
threshold=min_features, save='Buenrostro_bulk_peaks_coverage_cells_log10.png')
# minimum number of cells sharing a feature
min_cells = 5
epi.pp.coverage_features(adata, binary=True, log=False,
threshold=min_cells, save='Buenrostro_bulk_peaks_coverage_peaks.png')
epi.pp.coverage_features(adata, binary=True, log=True,
threshold=min_cells, save='Buenrostro_bulk_peaks_coverage_peaks_log10.png')
min_features = 1000
epi.pp.filter_cells(adata, min_features=min_features)
adata
AnnData object with n_obs × n_vars = 1945 × 467226
obs: 'batch', 'cell_name', 'facs_label', 'label', 'cell_type', 'nb_features', 'log_nb_features'
var: 'transcript_annotation', 'n_cells', 'commonness'
min_cells = 5
epi.pp.filter_features(adata, min_cells=min_cells)
adata
AnnData object with n_obs × n_vars = 1945 × 311035
obs: 'batch', 'cell_name', 'facs_label', 'label', 'cell_type', 'nb_features', 'log_nb_features'
var: 'transcript_annotation', 'n_cells', 'commonness'
epi.pp.coverage_cells(adata, binary=True, log='log10', bins=50, threshold=min_features)
epi.pp.coverage_features(adata, binary=True, log='log10', bins=50, threshold=min_cells)
We aim to select a cuttof after the elbow.
epi.pp.cal_var(adata)
FutureWarning:/home/rs619065/miniconda3/envs/scopen/lib/python3.8/site-packages/seaborn/distributions.py:2557: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). FutureWarning:/home/rs619065/miniconda3/envs/scopen/lib/python3.8/site-packages/seaborn/distributions.py:2557: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms).
min_score_value = 0.515
nb_feature_selected = 120000
epi.pl.variability_features(adata,log=None,
min_score=min_score_value, nb_features=nb_feature_selected,
save='variability_features_plot_bonemarrow_peakmatrix.png')
epi.pl.variability_features(adata,log='log10',
min_score=min_score_value, nb_features=nb_feature_selected,
save='variability_features_plot_bonemarrow_peakmatrix_log10.png')
# save the current matrix in the raw layer
adata.raw = adata
# create a new AnnData containing only the most variable features
adata = epi.pp.select_var_feature(adata,
nb_features=nb_feature_selected,
show=False,
copy=True)
adata
View of AnnData object with n_obs × n_vars = 1945 × 122511
obs: 'batch', 'cell_name', 'facs_label', 'label', 'cell_type', 'nb_features', 'log_nb_features'
var: 'transcript_annotation', 'n_cells', 'commonness', 'prop_shared_cells', 'variability_score'
epi.pl.violin(adata, ['nb_features'])
epi.pl.violin(adata, ['log_nb_features'])
adata
View of AnnData object with n_obs × n_vars = 1945 × 122511
obs: 'batch', 'cell_name', 'facs_label', 'label', 'cell_type', 'nb_features', 'log_nb_features'
var: 'transcript_annotation', 'n_cells', 'commonness', 'prop_shared_cells', 'variability_score'
epi.pp.filter_cells(adata, min_features=2000)
epi.pp.filter_cells(adata, max_features=25000)
Trying to set attribute `.obs` of view, copying.
adata
AnnData object with n_obs × n_vars = 1722 × 122511
obs: 'batch', 'cell_name', 'facs_label', 'label', 'cell_type', 'nb_features', 'log_nb_features'
var: 'transcript_annotation', 'n_cells', 'commonness', 'prop_shared_cells', 'variability_score'
epi.pl.violin(adata, ['nb_features'])
epi.pl.violin(adata, ['log_nb_features'])
from scopen.Main import scopen_dr
adata.obsm['X_pca'] = np.transpose(scopen_dr(np.transpose(adata.X)))
08/20/2021 10:08:30, running tf-idf transformation... 08/20/2021 10:08:44, iteration: 0, violation: 1.00000000 08/20/2021 10:08:45, iteration: 1, violation: 0.23809547 08/20/2021 10:08:45, iteration: 2, violation: 0.15862548 08/20/2021 10:08:45, iteration: 3, violation: 0.10376727 08/20/2021 10:08:46, iteration: 4, violation: 0.06349803 08/20/2021 10:08:46, iteration: 5, violation: 0.04762651 08/20/2021 10:08:47, iteration: 6, violation: 0.04229391 08/20/2021 10:08:47, iteration: 7, violation: 0.03723540 08/20/2021 10:08:48, iteration: 8, violation: 0.03254635 08/20/2021 10:08:48, iteration: 9, violation: 0.02844828 08/20/2021 10:08:48, iteration: 10, violation: 0.02518969 08/20/2021 10:08:49, iteration: 11, violation: 0.02276936 08/20/2021 10:08:49, iteration: 12, violation: 0.02096878 08/20/2021 10:08:50, iteration: 13, violation: 0.01953172 08/20/2021 10:08:50, iteration: 14, violation: 0.01826928 08/20/2021 10:08:50, iteration: 15, violation: 0.01709340 08/20/2021 10:08:51, iteration: 16, violation: 0.01597418 08/20/2021 10:08:51, iteration: 17, violation: 0.01491398 08/20/2021 10:08:52, iteration: 18, violation: 0.01391376 08/20/2021 10:08:52, iteration: 19, violation: 0.01298998 08/20/2021 10:08:52, iteration: 20, violation: 0.01215548 08/20/2021 10:08:53, iteration: 21, violation: 0.01141767 08/20/2021 10:08:53, iteration: 22, violation: 0.01077700 08/20/2021 10:08:54, iteration: 23, violation: 0.01021965 08/20/2021 10:08:54, iteration: 24, violation: 0.00973250 08/20/2021 10:08:54, iteration: 25, violation: 0.00930043 08/20/2021 10:08:55, iteration: 26, violation: 0.00890897 08/20/2021 10:08:55, iteration: 27, violation: 0.00855066 08/20/2021 10:08:56, iteration: 28, violation: 0.00821511 08/20/2021 10:08:56, iteration: 29, violation: 0.00790385 08/20/2021 10:08:56, iteration: 30, violation: 0.00761203 08/20/2021 10:08:57, iteration: 31, violation: 0.00733954 08/20/2021 10:08:57, iteration: 32, violation: 0.00708217 08/20/2021 10:08:58, iteration: 33, violation: 0.00683490 08/20/2021 10:08:58, iteration: 34, violation: 0.00659618 08/20/2021 10:08:58, iteration: 35, violation: 0.00636058 08/20/2021 10:08:59, iteration: 36, violation: 0.00612942 08/20/2021 10:08:59, iteration: 37, violation: 0.00589915 08/20/2021 10:09:00, iteration: 38, violation: 0.00566925 08/20/2021 10:09:00, iteration: 39, violation: 0.00543857 08/20/2021 10:09:00, iteration: 40, violation: 0.00521031 08/20/2021 10:09:01, iteration: 41, violation: 0.00498624 08/20/2021 10:09:01, iteration: 42, violation: 0.00476971 08/20/2021 10:09:02, iteration: 43, violation: 0.00455888 08/20/2021 10:09:02, iteration: 44, violation: 0.00435613 08/20/2021 10:09:02, iteration: 45, violation: 0.00416258 08/20/2021 10:09:03, iteration: 46, violation: 0.00397863 08/20/2021 10:09:03, iteration: 47, violation: 0.00380585 08/20/2021 10:09:04, iteration: 48, violation: 0.00364409 08/20/2021 10:09:04, iteration: 49, violation: 0.00349495 08/20/2021 10:09:04, iteration: 50, violation: 0.00335874 08/20/2021 10:09:05, iteration: 51, violation: 0.00323439 08/20/2021 10:09:05, iteration: 52, violation: 0.00311983 08/20/2021 10:09:06, iteration: 53, violation: 0.00301411 08/20/2021 10:09:06, iteration: 54, violation: 0.00291534 08/20/2021 10:09:06, iteration: 55, violation: 0.00282238 08/20/2021 10:09:07, iteration: 56, violation: 0.00273499 08/20/2021 10:09:07, iteration: 57, violation: 0.00265252 08/20/2021 10:09:08, iteration: 58, violation: 0.00257507 08/20/2021 10:09:08, iteration: 59, violation: 0.00250194 08/20/2021 10:09:08, iteration: 60, violation: 0.00243270 08/20/2021 10:09:09, iteration: 61, violation: 0.00236747 08/20/2021 10:09:09, iteration: 62, violation: 0.00230589 08/20/2021 10:09:10, iteration: 63, violation: 0.00224785 08/20/2021 10:09:10, iteration: 64, violation: 0.00219304 08/20/2021 10:09:10, iteration: 65, violation: 0.00214112 08/20/2021 10:09:11, iteration: 66, violation: 0.00209175 08/20/2021 10:09:11, iteration: 67, violation: 0.00204503 08/20/2021 10:09:12, iteration: 68, violation: 0.00200065 08/20/2021 10:09:12, iteration: 69, violation: 0.00195797 08/20/2021 10:09:12, iteration: 70, violation: 0.00191670 08/20/2021 10:09:13, iteration: 71, violation: 0.00187690 08/20/2021 10:09:13, iteration: 72, violation: 0.00183782 08/20/2021 10:09:14, iteration: 73, violation: 0.00179931 08/20/2021 10:09:14, iteration: 74, violation: 0.00176117 08/20/2021 10:09:14, iteration: 75, violation: 0.00172332 08/20/2021 10:09:15, iteration: 76, violation: 0.00168517 08/20/2021 10:09:15, iteration: 77, violation: 0.00164690 08/20/2021 10:09:16, iteration: 78, violation: 0.00160820 08/20/2021 10:09:16, iteration: 79, violation: 0.00156930 08/20/2021 10:09:16, iteration: 80, violation: 0.00153021 08/20/2021 10:09:17, iteration: 81, violation: 0.00149059 08/20/2021 10:09:17, iteration: 82, violation: 0.00145038 08/20/2021 10:09:18, iteration: 83, violation: 0.00140987 08/20/2021 10:09:18, iteration: 84, violation: 0.00136918 08/20/2021 10:09:18, iteration: 85, violation: 0.00132823 08/20/2021 10:09:19, iteration: 86, violation: 0.00128681 08/20/2021 10:09:19, iteration: 87, violation: 0.00124540 08/20/2021 10:09:19, iteration: 88, violation: 0.00120380 08/20/2021 10:09:20, iteration: 89, violation: 0.00116222 08/20/2021 10:09:20, iteration: 90, violation: 0.00112061 08/20/2021 10:09:21, iteration: 91, violation: 0.00107984 08/20/2021 10:09:21, iteration: 92, violation: 0.00104014 08/20/2021 10:09:21, iteration: 93, violation: 0.00100163 08/20/2021 10:09:22, iteration: 94, violation: 0.00096499 08/20/2021 10:09:22, iteration: 95, violation: 0.00093034 08/20/2021 10:09:23, iteration: 96, violation: 0.00089803 08/20/2021 10:09:23, iteration: 97, violation: 0.00086791 08/20/2021 10:09:23, iteration: 98, violation: 0.00084000 08/20/2021 10:09:24, iteration: 99, violation: 0.00081409 08/20/2021 10:09:24, iteration: 100, violation: 0.00078986 08/20/2021 10:09:25, iteration: 101, violation: 0.00076719 08/20/2021 10:09:25, iteration: 102, violation: 0.00074589 08/20/2021 10:09:25, iteration: 103, violation: 0.00072579 08/20/2021 10:09:26, iteration: 104, violation: 0.00070673 08/20/2021 10:09:26, iteration: 105, violation: 0.00068862 08/20/2021 10:09:27, iteration: 106, violation: 0.00067129 08/20/2021 10:09:27, iteration: 107, violation: 0.00065468 08/20/2021 10:09:27, iteration: 108, violation: 0.00063877 08/20/2021 10:09:28, iteration: 109, violation: 0.00062346 08/20/2021 10:09:28, iteration: 110, violation: 0.00060870 08/20/2021 10:09:29, iteration: 111, violation: 0.00059447 08/20/2021 10:09:29, iteration: 112, violation: 0.00058075 08/20/2021 10:09:29, iteration: 113, violation: 0.00056743 08/20/2021 10:09:30, iteration: 114, violation: 0.00055454 08/20/2021 10:09:30, iteration: 115, violation: 0.00054205 08/20/2021 10:09:31, iteration: 116, violation: 0.00052990 08/20/2021 10:09:31, iteration: 117, violation: 0.00051814 08/20/2021 10:09:31, iteration: 118, violation: 0.00050670 08/20/2021 10:09:32, iteration: 119, violation: 0.00049561 08/20/2021 10:09:32, iteration: 120, violation: 0.00048484 08/20/2021 10:09:33, iteration: 121, violation: 0.00047441 08/20/2021 10:09:33, iteration: 122, violation: 0.00046425 08/20/2021 10:09:33, iteration: 123, violation: 0.00045440 08/20/2021 10:09:34, iteration: 124, violation: 0.00044486 08/20/2021 10:09:34, iteration: 125, violation: 0.00043552 08/20/2021 10:09:35, iteration: 126, violation: 0.00042647 08/20/2021 10:09:35, iteration: 127, violation: 0.00041770 08/20/2021 10:09:35, iteration: 128, violation: 0.00040919 08/20/2021 10:09:36, iteration: 129, violation: 0.00040091 08/20/2021 10:09:36, iteration: 130, violation: 0.00039288 08/20/2021 10:09:37, iteration: 131, violation: 0.00038507 08/20/2021 10:09:37, iteration: 132, violation: 0.00037749 08/20/2021 10:09:37, iteration: 133, violation: 0.00037014 08/20/2021 10:09:38, iteration: 134, violation: 0.00036299 08/20/2021 10:09:38, iteration: 135, violation: 0.00035606 08/20/2021 10:09:39, iteration: 136, violation: 0.00034932 08/20/2021 10:09:39, iteration: 137, violation: 0.00034277 08/20/2021 10:09:39, iteration: 138, violation: 0.00033640 08/20/2021 10:09:40, iteration: 139, violation: 0.00033021 08/20/2021 10:09:40, iteration: 140, violation: 0.00032421 08/20/2021 10:09:41, iteration: 141, violation: 0.00031837 08/20/2021 10:09:41, iteration: 142, violation: 0.00031270 08/20/2021 10:09:41, iteration: 143, violation: 0.00030719 08/20/2021 10:09:42, iteration: 144, violation: 0.00030184 08/20/2021 10:09:42, iteration: 145, violation: 0.00029666 08/20/2021 10:09:43, iteration: 146, violation: 0.00029162 08/20/2021 10:09:43, iteration: 147, violation: 0.00028672 08/20/2021 10:09:43, iteration: 148, violation: 0.00028195 08/20/2021 10:09:44, iteration: 149, violation: 0.00027731 08/20/2021 10:09:44, iteration: 150, violation: 0.00027281 08/20/2021 10:09:45, iteration: 151, violation: 0.00026845 08/20/2021 10:09:45, iteration: 152, violation: 0.00026420 08/20/2021 10:09:45, iteration: 153, violation: 0.00026007 08/20/2021 10:09:46, iteration: 154, violation: 0.00025607 08/20/2021 10:09:46, iteration: 155, violation: 0.00025218 08/20/2021 10:09:47, iteration: 156, violation: 0.00024841 08/20/2021 10:09:47, iteration: 157, violation: 0.00024475 08/20/2021 10:09:47, iteration: 158, violation: 0.00024118 08/20/2021 10:09:48, iteration: 159, violation: 0.00023773 08/20/2021 10:09:48, iteration: 160, violation: 0.00023437 08/20/2021 10:09:49, iteration: 161, violation: 0.00023110 08/20/2021 10:09:49, iteration: 162, violation: 0.00022792 08/20/2021 10:09:49, iteration: 163, violation: 0.00022482 08/20/2021 10:09:50, iteration: 164, violation: 0.00022178 08/20/2021 10:09:50, iteration: 165, violation: 0.00021883 08/20/2021 10:09:51, iteration: 166, violation: 0.00021595 08/20/2021 10:09:51, iteration: 167, violation: 0.00021315 08/20/2021 10:09:51, iteration: 168, violation: 0.00021041 08/20/2021 10:09:52, iteration: 169, violation: 0.00020776 08/20/2021 10:09:52, iteration: 170, violation: 0.00020517 08/20/2021 10:09:53, iteration: 171, violation: 0.00020266 08/20/2021 10:09:53, iteration: 172, violation: 0.00020022 08/20/2021 10:09:53, iteration: 173, violation: 0.00019785 08/20/2021 10:09:54, iteration: 174, violation: 0.00019554 08/20/2021 10:09:54, iteration: 175, violation: 0.00019330 08/20/2021 10:09:54, iteration: 176, violation: 0.00019111 08/20/2021 10:09:55, iteration: 177, violation: 0.00018899 08/20/2021 10:09:55, iteration: 178, violation: 0.00018692 08/20/2021 10:09:56, iteration: 179, violation: 0.00018492 08/20/2021 10:09:56, iteration: 180, violation: 0.00018297 08/20/2021 10:09:56, iteration: 181, violation: 0.00018107 08/20/2021 10:09:57, iteration: 182, violation: 0.00017923 08/20/2021 10:09:57, iteration: 183, violation: 0.00017744 08/20/2021 10:09:58, iteration: 184, violation: 0.00017570 08/20/2021 10:09:58, iteration: 185, violation: 0.00017400 08/20/2021 10:09:58, iteration: 186, violation: 0.00017235 08/20/2021 10:09:59, iteration: 187, violation: 0.00017074 08/20/2021 10:09:59, iteration: 188, violation: 0.00016917 08/20/2021 10:10:00, iteration: 189, violation: 0.00016764 08/20/2021 10:10:00, iteration: 190, violation: 0.00016615 08/20/2021 10:10:00, iteration: 191, violation: 0.00016470 08/20/2021 10:10:01, iteration: 192, violation: 0.00016328 08/20/2021 10:10:01, iteration: 193, violation: 0.00016189 08/20/2021 10:10:02, iteration: 194, violation: 0.00016054 08/20/2021 10:10:02, iteration: 195, violation: 0.00015922 08/20/2021 10:10:02, iteration: 196, violation: 0.00015794 08/20/2021 10:10:03, iteration: 197, violation: 0.00015670 08/20/2021 10:10:03, iteration: 198, violation: 0.00015548 08/20/2021 10:10:04, iteration: 199, violation: 0.00015430 08/20/2021 10:10:04, ranks: 30, fitting error: 39.61248220278519
epi.pp.neighbors(adata)
epi.tl.umap(adata)
WARNING: .obsp["connectivities"] have not been computed using umap
epi.pl.umap(adata, color=['nb_features', 'log_nb_features', 'cell_type'], wspace=0.3)
epi.tl.louvain(adata)
epi.pl.umap(adata, color=['louvain', 'cell_type', 'facs_label'], wspace=0.4)
epi.tl.getNClusters(adata, n_cluster=14)
step 0 got 17 at resolution 1.5 step 1 got 12 at resolution 0.75 step 2 got 15 at resolution 1.125 step 3 got 14 at resolution 0.9375
epi.pl.umap(adata, color=['louvain', 'cell_type', 'facs_label'], wspace=0.4)
epi.tl.kmeans(adata, num_clusters=14)
epi.pl.umap(adata, color=['kmeans', 'cell_type', 'facs_label'], wspace=0.4)
epi.tl.hc(adata, num_clusters=14)
epi.pl.umap(adata, color=['hc', 'cell_type', 'facs_label'], wspace=0.4)
epi.tl.leiden(adata)
epi.pl.umap(adata, color=['leiden', 'cell_type', 'facs_label'], wspace=0.4)
epi.tl.getNClusters(adata, n_cluster=14, method='leiden')
step 0 got 18 at resolution 1.5 step 1 got 13 at resolution 0.75 step 2 got 16 at resolution 1.125 step 3 got 16 at resolution 0.9375 step 4 got 13 at resolution 0.84375 step 5 got 15 at resolution 0.890625 step 6 got 13 at resolution 0.8671875 step 7 got 15 at resolution 0.87890625 step 8 got 15 at resolution 0.873046875 step 9 got 15 at resolution 0.8701171875 step 10 got 13 at resolution 0.86865234375 step 11 got 13 at resolution 0.869384765625 step 12 got 15 at resolution 0.8697509765625 step 13 got 15 at resolution 0.86956787109375 step 14 got 15 at resolution 0.869476318359375 step 15 got 13 at resolution 0.8694305419921875 step 16 got 13 at resolution 0.8694534301757812 step 17 got 15 at resolution 0.8694648742675781 step 18 got 15 at resolution 0.8694591522216797 step 19 got 13 at resolution 0.8694562911987305
epi.pl.umap(adata, color=['leiden', 'cell_type', 'facs_label'], wspace=0.4)
# True labels
epi.pl.umap(adata, color=['cell_type', 'facs_label'], wspace=0.4)
# Clusters
epi.pl.umap(adata, color=['louvain', 'kmeans', 'hc', 'leiden'], wspace=0.4)
For all these methods. The best value possible is 1.
1) Compute the Adjusted Rand Index for the different clustering to determine which one perform best. It computes a similarity measure between two clusterings (predicted and true labels)by counting cells that are assigned in the same or different clusters between the two clusterings.
print('louvain:\t', epi.tl.ARI(adata, 'louvain', 'cell_type'))
print('kmeans:\t', epi.tl.ARI(adata, 'kmeans', 'cell_type'))
print('hc:\t', epi.tl.ARI(adata, 'hc', 'cell_type'))
print('leiden:\t', epi.tl.ARI(adata, 'leiden', 'cell_type'))
louvain: 0.5026830907128316 kmeans: 0.47398964189429343 hc: 0.4443018268622743 leiden: 0.494350768736209
2) Compute the Homogeneity score. The score is higher when the different clusters contain only cells with the same ground truth label
print('louvain:\t', epi.tl.homogeneity(adata, 'louvain', 'cell_type'))
print('kmeans:\t', epi.tl.homogeneity(adata, 'kmeans', 'cell_type'))
print('hc:\t', epi.tl.homogeneity(adata, 'hc', 'cell_type'))
print('leiden:\t', epi.tl.homogeneity(adata, 'leiden', 'cell_type'))
louvain: 0.620855120000249 kmeans: 0.607717394358658 hc: 0.6095090801233856 leiden: 0.6247759815798529
3) Compute the Adjusted Mutual Information, it is measure of the similarity between two labels of the same data, while accounting for chance (the Mutual information is generally higher for two set of labels with a large number of clusters)
print('louvain:\t', epi.tl.AMI(adata, 'louvain', 'cell_type'))
print('kmeans:\t', epi.tl.AMI(adata, 'kmeans', 'cell_type'))
print('hc:\t', epi.tl.AMI(adata, 'hc', 'cell_type'))
print('leiden:\t', epi.tl.AMI(adata, 'leiden', 'cell_type'))
louvain: 0.6756279946014332 kmeans: 0.6371966070741591 hc: 0.6380117508290446 leiden: 0.664802449730221
## Save the processed Anndata
adata.write(results_file)
adata = ad.read(results_file)
adata
AnnData object with n_obs × n_vars = 1722 × 122511
obs: 'batch', 'cell_name', 'facs_label', 'label', 'cell_type', 'nb_features', 'log_nb_features', 'louvain', 'kmeans', 'hc', 'leiden'
var: 'transcript_annotation', 'n_cells', 'commonness', 'prop_shared_cells', 'variability_score'
uns: 'cell_type_colors', 'facs_label_colors', 'hc_colors', 'kmeans_colors', 'leiden', 'leiden_colors', 'louvain', 'louvain_colors', 'neighbors', 'umap'
obsm: 'X_pca', 'X_umap'
obsp: 'connectivities', 'distances'
epi.tl.rank_features(adata, 'louvain', omic='ATAC')
WARNING: It seems you use rank_genes_groups on the raw count data. Please logarithmize your data before calling rank_genes_groups.
epi.pl.rank_feat_groups(adata)
epi.pl.rank_feat_groups(adata, feature_symbols='transcript_annotation')
epi.pl.rank_feat_groups_matrixplot(adata)
WARNING: dendrogram data not found (using key=dendrogram_louvain). Running `sc.tl.dendrogram` with default parameters. For fine tuning it is recommended to run `sc.tl.dendrogram` independently.